package com.buluo;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor for article pages under
 * {@code http://wodebuluo.gamedog.cn/gonglue/}.
 *
 * <p>While {@link #flag} is {@code false}, the processor collects article links
 * from the {@code m_center} container of the page being processed, queues them
 * as target requests and then sets the flag, so link discovery runs only once
 * until {@link #setFlag(boolean)} resets it. Every processed page (including the
 * one links were discovered on) gets the fields {@code id}, {@code title},
 * {@code content} and {@code date} put on it.
 *
 * <p>NOTE(review): {@code flag} is a plain, unsynchronized field; if a single
 * instance is driven by several crawler threads, link discovery could run more
 * than once — confirm whether that matters for the configured scheduler.
 */
@Component
@Scope("prototype")
public class BuluoGuanKaPageProcesser implements PageProcessor {

    /** Matches every character that is not an ASCII digit; compiled once and reused. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    /** Regex an extracted link must match to be queued as an article page. */
    private static final String ARTICLE_LINK_REGEX =
            "http://wodebuluo\\.gamedog\\.cn/gonglue/\\d+/\\d+\\.html";

    /** Common XPath prefix of the article container shared by all field selectors. */
    private static final String ARTICLE_XPATH =
            "//div[@class='main1']/div[@class='info_left']/div[@class='wen']";

    /** {@code true} once links have been discovered; see {@link #setFlag(boolean)}. */
    boolean flag = false;

    /**
     * Queues article links (first call only, while {@code flag} is {@code false})
     * and extracts the article fields from the given page.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        if (!flag) {
            List<String> links = page.getHtml()
                    .xpath("//div[@class='m_center']")
                    .links()
                    .regex(ARTICLE_LINK_REGEX)
                    .all();
            page.addTargetRequests(links);
            flag = true;
        }

        page.putField("id", extractDigits(page.getRequest().getUrl()));
        page.putField("title",
                page.getHtml().xpath(ARTICLE_XPATH + "/h1/text()").toString());
        page.putField("content",
                page.getHtml().xpath(ARTICLE_XPATH + "/div[@class='news_neirong']/html()").toString());
        page.putField("date",
                page.getHtml().xpath(ARTICLE_XPATH + "/div[@class='newsa']/span/text()").toString());
    }

    /**
     * Builds the crawl configuration: the target domain and the browser
     * user-agent string sent with requests.
     *
     * @return a newly created {@link Site} on every call
     */
    @Override
    public Site getSite() {
        return Site.me().setDomain("wodebuluo.gamedog.cn").
                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
    }

    /**
     * Sets the link-discovery flag.
     *
     * @param flag {@code false} to let the next processed page queue article
     *             links again, {@code true} to suppress link discovery
     */
    public void setFlag(boolean flag) {
        this.flag = flag;
    }

    /**
     * Removes every non-digit character from the URL and returns what is left,
     * i.e. all digits of the URL concatenated in order of appearance.
     */
    private static String extractDigits(String url) {
        return NON_DIGITS.matcher(url).replaceAll("");
    }

}
